// Copyright (c) 2021, gottingen group.
// All rights reserved.
// Created by liyinbin lijippy@163.com

#include "abel/random/poisson_distribution.h"

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <random>
#include <sstream>
#include <string>
#include <vector>

#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "abel/log/logging.h"
#include "abel/base/profile.h"
#include "abel/container/flat_hash_map.h"
#include "testing/chi_square.h"
#include "testing/distribution_test_util.h"
#include "abel/random/engine/sequence_urbg.h"
#include "abel/random/random.h"
#include "abel/strings/str_cat.h"
#include "abel/strings/format.h"
#include "abel/strings/str_replace.h"
#include "abel/strings/strip.h"

// Notes about generating poisson variates:
//
// It is unlikely that any implementation of std::poisson_distribution
// will be stable over time and across library implementations. For instance
// the three different poisson variate generators listed below all differ:
//
// https://github.com/ampl/gsl/tree/master/randist/poisson.c
// * GSL uses a gamma + binomial + knuth method to compute poisson variates.
//
// https://github.com/gcc-mirror/gcc/blob/master/libstdc%2B%2B-v3/include/bits/random.tcc
// * GCC uses the Devroye rejection algorithm, based on
// Devroye, L. Non-uniform Random Variates Generation. Springer-Verlag,
// New York, 1986, Ch. X, Sects. 3.3 & 3.4 (+ Errata!), ~p.511
//   http://www.nrbook.com/devroye/
//
// https://github.com/llvm-mirror/libcxx/blob/master/include/random
// * CLANG uses a different rejection method, which appears to include a
// normal-distribution approximation and an exponential distribution to
// compute the threshold, including a similar factorial approximation to this
// one, but it is unclear where the algorithm comes from, exactly.
//

namespace {

    using abel::random_internal::kChiSquared;

// The PoissonDistributionInterfaceTest provides a basic test that
// abel::poisson_distribution conforms to the interface and serialization
// requirements imposed by [rand.req.dist] for the common integer types.

    template<typename IntType>
    class PoissonDistributionInterfaceTest : public ::testing::Test {
    };

    using IntTypes = ::testing::Types<int, int8_t, int16_t, int32_t, int64_t,
            uint8_t, uint16_t, uint32_t, uint64_t>;
    TYPED_TEST_CASE
    (PoissonDistributionInterfaceTest, IntTypes);

    TYPED_TEST(PoissonDistributionInterfaceTest, SerializeTest) {
        using param_type = typename abel::poisson_distribution<TypeParam>::param_type;
        const double kMax =
                std::min(1e10 /* assertion limit */,
                         static_cast<double>(std::numeric_limits<TypeParam>::max()));

        const double kParams[] = {
                // Cases around 1.
                1,                         //
                std::nextafter(1.0, 0.0),  // 1 - epsilon
                std::nextafter(1.0, 2.0),  // 1 + epsilon
                // Arbitrary values.
                1e-8, 1e-4,
                0.0000005,  // ~7.2e-7
                0.2,        // ~0.2x
                0.5,        // 0.72
                2,          // ~2.8
                20,         // 3x ~9.6
                100, 1e4, 1e8, 1.5e9, 1e20,
                // Boundary cases.
                std::numeric_limits<double>::max(),
                std::numeric_limits<double>::epsilon(),
                std::nextafter(std::numeric_limits<double>::min(),
                               1.0),                        // min + epsilon
                std::numeric_limits<double>::min(),         // smallest normal
                std::numeric_limits<double>::denorm_min(),  // smallest denorm
                std::numeric_limits<double>::min() / 2,     // denorm
                std::nextafter(std::numeric_limits<double>::min(),
                               0.0),  // denorm_max
        };


        constexpr int kCount = 1000;
        abel::insecure_bit_gen gen;
        for (const double m : kParams) {
            const double mean = std::min(kMax, m);
            const param_type param(mean);

            // Validate parameters.
            abel::poisson_distribution<TypeParam> before(mean);
            EXPECT_EQ(before.mean(), param.mean());

            {
                abel::poisson_distribution<TypeParam> via_param(param);
                EXPECT_EQ(via_param, before);
                EXPECT_EQ(via_param.param(), before.param());
            }

            // Smoke test.
            auto sample_min = before.max();
            auto sample_max = before.min();
            for (int i = 0; i < kCount; i++) {
                auto sample = before(gen);
                EXPECT_GE(sample, before.min());
                EXPECT_LE(sample, before.max());
                if (sample > sample_max) sample_max = sample;
                if (sample < sample_min) sample_min = sample;
            }

            DLOG_INFO(abel::string_cat("Range {", param.mean(), "}: ",
                                                     +sample_min, ", ", +sample_max));

            // Validate stream serialization.
            std::stringstream ss;
            ss << before;

            abel::poisson_distribution<TypeParam> after(3.8);

            EXPECT_NE(before.mean(), after.mean());
            EXPECT_NE(before.param(), after.param());
            EXPECT_NE(before, after);

            ss >> after;

            EXPECT_EQ(before.mean(), after.mean())  //
                                                            << ss.str() << " "                  //
                                                            << (ss.good() ? "good " : "")       //
                                                            << (ss.bad() ? "bad " : "")         //
                                                            << (ss.eof() ? "eof " : "")         //
                                                            << (ss.fail() ? "fail " : "");
        }
    }

// See http://www.itl.nist.gov/div898/handbook/eda/section3/eda366j.htm

    class PoissonModel {
    public:
        explicit PoissonModel(double mean) : mean_(mean) {}

        double mean() const { return mean_; }

        double variance() const { return mean_; }

        double stddev() const { return std::sqrt(variance()); }

        double skew() const { return 1.0 / mean_; }

        double kurtosis() const { return 3.0 + 1.0 / mean_; }

        // InitCDF() initializes the CDF for the distribution parameters.
        void InitCDF();

        // The InverseCDF, or the Percent-point function returns x, P(x) < v.
        struct CDF {
            size_t index;
            double pmf;
            double cdf;
        };

        CDF InverseCDF(double p) {
            CDF target{0, 0, p};
            auto it = std::upper_bound(
                    std::begin(cdf_), std::end(cdf_), target,
                    [](const CDF &a, const CDF &b) { return a.cdf < b.cdf; });
            return *it;
        }

        void LogCDF() {
            DLOG_INFO(abel::string_cat("CDF (mean = ", mean_, ")"));
            for (const auto c : cdf_) {
                DLOG_INFO(abel::string_cat(c.index, ": pmf=", c.pmf, " cdf=", c.cdf));
            }
        }

    private:
        const double mean_;

        std::vector<CDF> cdf_;
    };

// The goal is to compute an InverseCDF function, or percent point function for
// the poisson distribution, and use that to partition our output into equal
// range buckets.  However there is no closed form solution for the inverse cdf
// for poisson distributions (the closest is the incomplete gamma function).
// Instead, `InitCDF` iteratively computes the PMF and the CDF. This enables
// searching for the bucket points.
    void PoissonModel::InitCDF() {
        if (!cdf_.empty()) {
            // State already initialized.
            return;
        }
        ABEL_ASSERT(mean_ < 201.0);

        const size_t max_i = 50 * stddev() + mean();
        const double e_neg_mean = std::exp(-mean());
        ABEL_ASSERT(e_neg_mean > 0);

        double d = 1;
        double last_result = e_neg_mean;
        double cumulative = e_neg_mean;
        if (e_neg_mean > 1e-10) {
            cdf_.push_back({0, e_neg_mean, cumulative});
        }
        for (size_t i = 1; i < max_i; i++) {
            d *= (mean() / i);
            double result = e_neg_mean * d;
            cumulative += result;
            if (result < 1e-10 && result < last_result && cumulative > 0.999999) {
                break;
            }
            if (result > 1e-7) {
                cdf_.push_back({i, result, cumulative});
            }
            last_result = result;
        }
        ABEL_ASSERT(!cdf_.empty());
    }

// PoissonDistributionZTest implements a z-test for the poisson distribution.

    struct ZParam {
        double mean;
        double p_fail;   // Z-Test probability of failure.
        int trials;      // Z-Test trials.
        size_t samples;  // Z-Test samples.
    };

    class PoissonDistributionZTest : public testing::TestWithParam<ZParam>,
                                     public PoissonModel {
    public:
        PoissonDistributionZTest() : PoissonModel(GetParam().mean) {}

        // ZTestImpl provides a basic z-squared test of the mean vs. expected
        // mean for data generated by the poisson distribution.
        template<typename D>
        bool SingleZTest(const double p, const size_t samples);

        abel::insecure_bit_gen rng_;
    };

    template<typename D>
    bool PoissonDistributionZTest::SingleZTest(const double p,
                                               const size_t samples) {
        D dis(mean());

        abel::flat_hash_map<int32_t, int> buckets;
        std::vector<double> data;
        data.reserve(samples);
        for (size_t j = 0; j < samples; j++) {
            const auto x = dis(rng_);
            buckets[x]++;
            data.push_back(x);
        }

        // The null-hypothesis is that the distribution is a poisson distribution with
        // the provided mean (not estimated from the data).
        const auto m = abel::random_internal::ComputeDistributionMoments(data);
        const double max_err = abel::random_internal::MaxErrorTolerance(p);
        const double z = abel::random_internal::ZScore(mean(), m);
        const bool pass = abel::random_internal::Near("z", z, 0.0, max_err);

        if (!pass) {
            DLOG_INFO(abel::sprintf("p=%f max_err=%f\n"
                                       " mean=%f vs. %f\n"
                                       " stddev=%f vs. %f\n"
                                       " skewness=%f vs. %f\n"
                                       " kurtosis=%f vs. %f\n"
                                       " z=%f",
                                       p, max_err, m.mean, mean(), std::sqrt(m.variance),
                                       stddev(), m.skewness, skew(), m.kurtosis,
                                       kurtosis(), z));
        }
        return pass;
    }

    TEST_P(PoissonDistributionZTest, AbelPoissonDistribution) {
        const auto &param = GetParam();
        const int expected_failures =
                std::max(1, static_cast<int>(std::ceil(param.trials * param.p_fail)));
        const double p = abel::random_internal::RequiredSuccessProbability(
                param.p_fail, param.trials);

        int failures = 0;
        for (int i = 0; i < param.trials; i++) {
            failures +=
                    SingleZTest<abel::poisson_distribution<int32_t>>(p, param.samples) ? 0
                                                                                       : 1;
        }
        EXPECT_LE(failures, expected_failures);
    }

    std::vector<ZParam> GetZParams() {
        // These values have been adjusted from the "exact" computed values to reduce
        // failure rates.
        //
        // It turns out that the actual values are not as close to the expected values
        // as would be ideal.
        return std::vector<ZParam>({
                                           // Knuth method.
                                           ZParam{0.5, 0.01, 100, 1000},
                                           ZParam{1.0, 0.01, 100, 1000},
                                           ZParam{10.0, 0.01, 100, 5000},
                                           // Split-knuth method.
                                           ZParam{20.0, 0.01, 100, 10000},
                                           ZParam{50.0, 0.01, 100, 10000},
                                           // Ratio of gaussians method.
                                           ZParam{51.0, 0.01, 100, 10000},
                                           ZParam{200.0, 0.05, 10, 100000},
                                           ZParam{100000.0, 0.05, 10, 1000000},
                                   });
    }

    std::string ZParamName(const ::testing::TestParamInfo<ZParam> &info) {
        const auto &p = info.param;
        std::string name = abel::string_cat("mean_", abel::SixDigits(p.mean));
        return abel::string_replace_all(name, {{"+", "_"},
                                               {"-", "_"},
                                               {".", "_"}});
    }

    INSTANTIATE_TEST_SUITE_P(All, PoissonDistributionZTest,
                             ::testing::ValuesIn(GetZParams()), ZParamName);

// The PoissonDistributionChiSquaredTest class provides a basic test framework
// for variates generated by a conforming poisson_distribution.
    class PoissonDistributionChiSquaredTest : public testing::TestWithParam<double>,
                                              public PoissonModel {
    public:
        PoissonDistributionChiSquaredTest() : PoissonModel(GetParam()) {}

        // The ChiSquaredTestImpl provides a chi-squared goodness of fit test for data
        // generated by the poisson distribution.
        template<typename D>
        double ChiSquaredTestImpl();

    private:
        void InitChiSquaredTest(const double buckets);

        abel::insecure_bit_gen rng_;
        std::vector<size_t> cutoffs_;
        std::vector<double> expected_;
    };

    void PoissonDistributionChiSquaredTest::InitChiSquaredTest(
            const double buckets) {
        if (!cutoffs_.empty() && !expected_.empty()) {
            return;
        }
        InitCDF();

        // The code below finds cuttoffs that yield approximately equally-sized
        // buckets to the extent that it is possible. However for poisson
        // distributions this is particularly challenging for small mean parameters.
        // Track the expected proportion of items in each bucket.
        double last_cdf = 0;
        const double inc = 1.0 / buckets;
        for (double p = inc; p <= 1.0; p += inc) {
            auto result = InverseCDF(p);
            if (!cutoffs_.empty() && cutoffs_.back() == result.index) {
                continue;
            }
            double d = result.cdf - last_cdf;
            cutoffs_.push_back(result.index);
            expected_.push_back(d);
            last_cdf = result.cdf;
        }
        cutoffs_.push_back(std::numeric_limits<size_t>::max());
        expected_.push_back(std::max(0.0, 1.0 - last_cdf));
    }

    template<typename D>
    double PoissonDistributionChiSquaredTest::ChiSquaredTestImpl() {
        const int kSamples = 2000;
        const int kBuckets = 50;

        // The poisson CDF fails for large mean values, since e^-mean exceeds the
        // machine precision. For these cases, using a normal approximation would be
        // appropriate.
        ABEL_ASSERT(mean() <= 200);
        InitChiSquaredTest(kBuckets);

        D dis(mean());

        std::vector<int32_t> counts(cutoffs_.size(), 0);
        for (int j = 0; j < kSamples; j++) {
            const size_t x = dis(rng_);
            auto it = std::lower_bound(std::begin(cutoffs_), std::end(cutoffs_), x);
            counts[std::distance(cutoffs_.begin(), it)]++;
        }

        // Normalize the counts.
        std::vector<int32_t> e(expected_.size(), 0);
        for (size_t i = 0; i < e.size(); i++) {
            e[i] = kSamples * expected_[i];
        }

        // The null-hypothesis is that the distribution is a poisson distribution with
        // the provided mean (not estimated from the data).
        const int dof = static_cast<int>(counts.size()) - 1;

        // The threshold for logging is 1-in-50.
        const double threshold = abel::random_internal::chi_square_value(dof, 0.98);

        const double chi_square = abel::random_internal::chi_square(
                std::begin(counts), std::end(counts), std::begin(e), std::end(e));

        const double p = abel::random_internal::chi_square_p_value(chi_square, dof);

        // Log if the chi_squared value is above the threshold.
        if (chi_square > threshold) {
            LogCDF();

            DLOG_INFO(abel::string_cat("VALUES  buckets=", counts.size(),
                                                     "  samples=", kSamples));
            for (size_t i = 0; i < counts.size(); i++) {
                DLOG_INFO(abel::string_cat(cutoffs_[i], ": ", counts[i], " vs. E=", e[i]));
            }

            DLOG_INFO(abel::string_cat(kChiSquared, "(data, dof=", dof, ") = ", chi_square, " (",
                                     p, ")\n", " vs.\n", kChiSquared, " @ 0.98 = ", threshold));
        }
        return p;
    }

    TEST_P(PoissonDistributionChiSquaredTest, AbelPoissonDistribution) {
        const int kTrials = 20;

        // Large values are not yet supported -- this requires estimating the cdf
        // using the normal distribution instead of the poisson in this case.
        ASSERT_LE(mean(), 200.0);
        if (mean() > 200.0) {
            return;
        }

        int failures = 0;
        for (int i = 0; i < kTrials; i++) {
            double p_value = ChiSquaredTestImpl<abel::poisson_distribution<int32_t>>();
            if (p_value < 0.005) {
                failures++;
            }
        }
        // There is a 0.10% chance of producing at least one failure, so raise the
        // failure threshold high enough to allow for a flake rate < 10,000.
        EXPECT_LE(failures, 4);
    }

    INSTANTIATE_TEST_SUITE_P(All, PoissonDistributionChiSquaredTest,
                             ::testing::Values(0.5, 1.0, 2.0, 10.0, 50.0, 51.0,
                                               200.0));

// NOTE: abel::poisson_distribution is not guaranteed to be stable.
    TEST(PoissonDistributionTest, StabilityTest) {
        using testing::ElementsAre;
        // abel::poisson_distribution stability relies on stability of
        // std::exp, std::log, std::sqrt, std::ceil, std::floor, and
        // abel::fast_uniform_bits, abel::StirlingLogFactorial, abel::RandU64ToDouble.
        abel::random_internal::sequence_urbg urbg({
                                                          0x035b0dc7e0a18acfull, 0x06cebe0d2653682eull,
                                                          0x0061e9b23861596bull,
                                                          0x0003eb76f6f7f755ull, 0xFFCEA50FDB2F953Bull,
                                                          0xC332DDEFBE6C5AA5ull,
                                                          0x6558218568AB9702ull, 0x2AEF7DAD5B6E2F84ull,
                                                          0x1521B62829076170ull,
                                                          0xECDD4775619F1510ull, 0x13CCA830EB61BD96ull,
                                                          0x0334FE1EAA0363CFull,
                                                          0xB5735C904C70A239ull, 0xD59E9E0BCBAADE14ull,
                                                          0xEECC86BC60622CA7ull,
                                                          0x4864f22c059bf29eull, 0x247856d8b862665cull,
                                                          0xe46e86e9a1337e10ull,
                                                          0xd8c8541f3519b133ull, 0xe75b5162c567b9e4ull,
                                                          0xf732e5ded7009c5bull,
                                                          0xb170b98353121eacull, 0x1ec2e8986d2362caull,
                                                          0x814c8e35fe9a961aull,
                                                          0x0c3cd59c9b638a02ull, 0xcb3bb6478a07715cull,
                                                          0x1224e62c978bbc7full,
                                                          0x671ef2cb04e81f6eull, 0x3c1cbd811eaf1808ull,
                                                          0x1bbc23cfa8fac721ull,
                                                          0xa4c2cda65e596a51ull, 0xb77216fad37adf91ull,
                                                          0x836d794457c08849ull,
                                                          0xe083df03475f49d7ull, 0xbc9feb512e6b0d6cull,
                                                          0xb12d74fdd718c8c5ull,
                                                          0x12ff09653bfbe4caull, 0x8dd03a105bc4ee7eull,
                                                          0x5738341045ba0d85ull,
                                                          0xf3fd722dc65ad09eull, 0xfa14fd21ea2a5705ull,
                                                          0xffe6ea4d6edb0c73ull,
                                                          0xD07E9EFE2BF11FB4ull, 0x95DBDA4DAE909198ull,
                                                          0xEAAD8E716B93D5A0ull,
                                                          0xD08ED1D0AFC725E0ull, 0x8E3C5B2F8E7594B7ull,
                                                          0x8FF6E2FBF2122B64ull,
                                                          0x8888B812900DF01Cull, 0x4FAD5EA0688FC31Cull,
                                                          0xD1CFF191B3A8C1ADull,
                                                          0x2F2F2218BE0E1777ull, 0xEA752DFE8B021FA1ull,
                                                          0xE5A0CC0FB56F74E8ull,
                                                          0x18ACF3D6CE89E299ull, 0xB4A84FE0FD13E0B7ull,
                                                          0x7CC43B81D2ADA8D9ull,
                                                          0x165FA26680957705ull, 0x93CC7314211A1477ull,
                                                          0xE6AD206577B5FA86ull,
                                                          0xC75442F5FB9D35CFull, 0xEBCDAF0C7B3E89A0ull,
                                                          0xD6411BD3AE1E7E49ull,
                                                          0x00250E2D2071B35Eull, 0x226800BB57B8E0AFull,
                                                          0x2464369BF009B91Eull,
                                                          0x5563911D59DFA6AAull, 0x78C14389D95A537Full,
                                                          0x207D5BA202E5B9C5ull,
                                                          0x832603766295CFA9ull, 0x11C819684E734A41ull,
                                                          0xB3472DCA7B14A94Aull,
                                                  });

        std::vector<int> output(10);

        // Method 1.
        {
            abel::poisson_distribution<int> dist(5);
            std::generate(std::begin(output), std::end(output),
                          [&] { return dist(urbg); });
        }
        EXPECT_THAT(output,  // mean = 4.2
                    ElementsAre(1, 0, 0, 4, 2, 10, 3, 3, 7, 12));

        // Method 2.
        {
            urbg.reset();
            abel::poisson_distribution<int> dist(25);
            std::generate(std::begin(output), std::end(output),
                          [&] { return dist(urbg); });
        }
        EXPECT_THAT(output,  // mean = 19.8
                    ElementsAre(9, 35, 18, 10, 35, 18, 10, 35, 18, 10));

        // Method 3.
        {
            urbg.reset();
            abel::poisson_distribution<int> dist(121);
            std::generate(std::begin(output), std::end(output),
                          [&] { return dist(urbg); });
        }
        EXPECT_THAT(output,  // mean = 124.1
                    ElementsAre(161, 122, 129, 124, 112, 112, 117, 120, 130, 114));
    }

    TEST(PoissonDistributionTest, AlgorithmExpectedValue_1) {
        // This tests small values of the Knuth method.
        // The underlying uniform distribution will generate exactly 0.5.
        abel::random_internal::sequence_urbg urbg({0x8000000000000001ull});
        abel::poisson_distribution<int> dist(5);
        EXPECT_EQ(7, dist(urbg));
    }

    TEST(PoissonDistributionTest, AlgorithmExpectedValue_2) {
        // This tests larger values of the Knuth method.
        // The underlying uniform distribution will generate exactly 0.5.
        abel::random_internal::sequence_urbg urbg({0x8000000000000001ull});
        abel::poisson_distribution<int> dist(25);
        EXPECT_EQ(36, dist(urbg));
    }

    TEST(PoissonDistributionTest, AlgorithmExpectedValue_3) {
        // This variant uses the ratio of uniforms method.
        abel::random_internal::sequence_urbg urbg(
                {0x7fffffffffffffffull, 0x8000000000000000ull});

        abel::poisson_distribution<int> dist(121);
        EXPECT_EQ(121, dist(urbg));
    }

}  // namespace
